# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
sb.set_style("darkgrid")
# load in the dataset into a pandas dataframe, print the head
# load in the dataset into a pandas dataframe, print the head
# NOTE(review): assumes 'prosperLoanData.csv' is in the working directory — confirm path
prosper_loan = pd.read_csv('prosperLoanData.csv')
prosper_loan.head(10)
| ListingKey | ListingNumber | ListingCreationDate | CreditGrade | Term | LoanStatus | ClosedDate | BorrowerAPR | BorrowerRate | LenderYield | ... | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1021339766868145413AB3B | 193129 | 2007-08-26 19:09:29.263000000 | C | 36 | Completed | 2009-08-14 00:00:00 | 0.16516 | 0.1580 | 0.1380 | ... | -133.18 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 258 |
| 1 | 10273602499503308B223C1 | 1209647 | 2014-02-27 08:28:07.900000000 | NaN | 36 | Current | NaN | 0.12016 | 0.0920 | 0.0820 | ... | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 2 | 0EE9337825851032864889A | 81716 | 2007-01-05 15:00:47.090000000 | HR | 36 | Completed | 2009-12-17 00:00:00 | 0.28269 | 0.2750 | 0.2400 | ... | -24.20 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 41 |
| 3 | 0EF5356002482715299901A | 658116 | 2012-10-22 11:02:35.010000000 | NaN | 36 | Current | NaN | 0.12528 | 0.0974 | 0.0874 | ... | -108.01 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 158 |
| 4 | 0F023589499656230C5E3E2 | 909464 | 2013-09-14 18:38:39.097000000 | NaN | 36 | Current | NaN | 0.24614 | 0.2085 | 0.1985 | ... | -60.27 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 20 |
| 5 | 0F05359734824199381F61D | 1074836 | 2013-12-14 08:26:37.093000000 | NaN | 60 | Current | NaN | 0.15425 | 0.1314 | 0.1214 | ... | -25.33 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 6 | 0F0A3576754255009D63151 | 750899 | 2013-04-12 09:52:56.147000000 | NaN | 36 | Current | NaN | 0.31032 | 0.2712 | 0.2612 | ... | -22.95 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 7 | 0F1035772717087366F9EA7 | 768193 | 2013-05-05 06:49:27.493000000 | NaN | 36 | Current | NaN | 0.23939 | 0.2019 | 0.1919 | ... | -69.21 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 8 | 0F043596202561788EA13D5 | 1023355 | 2013-12-02 10:43:39.117000000 | NaN | 36 | Current | NaN | 0.07620 | 0.0629 | 0.0529 | ... | -16.77 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 9 | 0F043596202561788EA13D5 | 1023355 | 2013-12-02 10:43:39.117000000 | NaN | 36 | Current | NaN | 0.07620 | 0.0629 | 0.0529 | ... | -16.77 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
10 rows × 81 columns
# list all 81 columns so we can pick the subset relevant to the analysis
prosper_loan.columns
Index(['ListingKey', 'ListingNumber', 'ListingCreationDate', 'CreditGrade',
'Term', 'LoanStatus', 'ClosedDate', 'BorrowerAPR', 'BorrowerRate',
'LenderYield', 'EstimatedEffectiveYield', 'EstimatedLoss',
'EstimatedReturn', 'ProsperRating (numeric)', 'ProsperRating (Alpha)',
'ProsperScore', 'ListingCategory (numeric)', 'BorrowerState',
'Occupation', 'EmploymentStatus', 'EmploymentStatusDuration',
'IsBorrowerHomeowner', 'CurrentlyInGroup', 'GroupKey',
'DateCreditPulled', 'CreditScoreRangeLower', 'CreditScoreRangeUpper',
'FirstRecordedCreditLine', 'CurrentCreditLines', 'OpenCreditLines',
'TotalCreditLinespast7years', 'OpenRevolvingAccounts',
'OpenRevolvingMonthlyPayment', 'InquiriesLast6Months', 'TotalInquiries',
'CurrentDelinquencies', 'AmountDelinquent', 'DelinquenciesLast7Years',
'PublicRecordsLast10Years', 'PublicRecordsLast12Months',
'RevolvingCreditBalance', 'BankcardUtilization',
'AvailableBankcardCredit', 'TotalTrades',
'TradesNeverDelinquent (percentage)', 'TradesOpenedLast6Months',
'DebtToIncomeRatio', 'IncomeRange', 'IncomeVerifiable',
'StatedMonthlyIncome', 'LoanKey', 'TotalProsperLoans',
'TotalProsperPaymentsBilled', 'OnTimeProsperPayments',
'ProsperPaymentsLessThanOneMonthLate',
'ProsperPaymentsOneMonthPlusLate', 'ProsperPrincipalBorrowed',
'ProsperPrincipalOutstanding', 'ScorexChangeAtTimeOfListing',
'LoanCurrentDaysDelinquent', 'LoanFirstDefaultedCycleNumber',
'LoanMonthsSinceOrigination', 'LoanNumber', 'LoanOriginalAmount',
'LoanOriginationDate', 'LoanOriginationQuarter', 'MemberKey',
'MonthlyLoanPayment', 'LP_CustomerPayments',
'LP_CustomerPrincipalPayments', 'LP_InterestandFees', 'LP_ServiceFees',
'LP_CollectionFees', 'LP_GrossPrincipalLoss', 'LP_NetPrincipalLoss',
'LP_NonPrincipalRecoverypayments', 'PercentFunded', 'Recommendations',
'InvestmentFromFriendsCount', 'InvestmentFromFriendsAmount',
'Investors'],
dtype='object')
# Columns relevant to the lender-yield analysis.
chosen_cols = ['ListingNumber', 'ListingCreationDate', 'ListingCategory (numeric)', 'Term', 'LoanOriginalAmount',
               'MonthlyLoanPayment', 'EstimatedLoss', 'LenderYield', 'LoanStatus', 'ProsperRating (Alpha)',
               'ProsperScore', 'EmploymentStatus', 'IncomeRange', 'StatedMonthlyIncome']
# .copy() makes loan_filter an independent DataFrame instead of a view,
# so the column assignments in later cells do not raise SettingWithCopyWarning
loan_filter = prosper_loan[chosen_cols].copy()
loan_filter
| ListingNumber | ListingCreationDate | ListingCategory (numeric) | Term | LoanOriginalAmount | MonthlyLoanPayment | EstimatedLoss | LenderYield | LoanStatus | ProsperRating (Alpha) | ProsperScore | EmploymentStatus | IncomeRange | StatedMonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 193129 | 2007-08-26 19:09:29.263000000 | 0 | 36 | 9425 | 330.43 | NaN | 0.1380 | Completed | NaN | NaN | Self-employed | $25,000-49,999 | 3083.333333 |
| 1 | 1209647 | 2014-02-27 08:28:07.900000000 | 2 | 36 | 10000 | 318.93 | 0.0249 | 0.0820 | Current | A | 7.0 | Employed | $50,000-74,999 | 6125.000000 |
| 2 | 81716 | 2007-01-05 15:00:47.090000000 | 0 | 36 | 3001 | 123.32 | NaN | 0.2400 | Completed | NaN | NaN | Not available | Not displayed | 2083.333333 |
| 3 | 658116 | 2012-10-22 11:02:35.010000000 | 16 | 36 | 10000 | 321.45 | 0.0249 | 0.0874 | Current | A | 9.0 | Employed | $25,000-49,999 | 2875.000000 |
| 4 | 909464 | 2013-09-14 18:38:39.097000000 | 2 | 36 | 15000 | 563.97 | 0.0925 | 0.1985 | Current | D | 4.0 | Employed | $100,000+ | 9583.333333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113932 | 753087 | 2013-04-14 05:55:02.663000000 | 1 | 36 | 10000 | 364.74 | 0.0699 | 0.1764 | Current | C | 5.0 | Employed | $50,000-74,999 | 4333.333333 |
| 113933 | 537216 | 2011-11-03 20:42:55.333000000 | 7 | 36 | 2000 | 65.57 | 0.0200 | 0.1010 | FinalPaymentInProgress | A | 8.0 | Employed | $75,000-99,999 | 8041.666667 |
| 113934 | 1069178 | 2013-12-13 05:49:12.703000000 | 1 | 60 | 10000 | 273.35 | 0.1025 | 0.2050 | Current | D | 3.0 | Employed | $25,000-49,999 | 2875.000000 |
| 113935 | 539056 | 2011-11-14 13:18:26.597000000 | 2 | 60 | 15000 | 449.55 | 0.0850 | 0.2505 | Completed | C | 5.0 | Full-time | $25,000-49,999 | 3875.000000 |
| 113936 | 1140093 | 2014-01-15 09:27:37.657000000 | 1 | 36 | 2000 | 64.90 | 0.0299 | 0.0939 | Current | A | 7.0 | Employed | $50,000-74,999 | 4583.333333 |
113937 rows × 14 columns
loan_filter.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 113937 non-null int64 1 ListingCreationDate 113937 non-null object 2 ListingCategory (numeric) 113937 non-null int64 3 Term 113937 non-null int64 4 LoanOriginalAmount 113937 non-null int64 5 MonthlyLoanPayment 113937 non-null float64 6 EstimatedLoss 84853 non-null float64 7 LenderYield 113937 non-null float64 8 LoanStatus 113937 non-null object 9 ProsperRating (Alpha) 84853 non-null object 10 ProsperScore 84853 non-null float64 11 EmploymentStatus 111682 non-null object 12 IncomeRange 113937 non-null object 13 StatedMonthlyIncome 113937 non-null float64 dtypes: float64(5), int64(4), object(5) memory usage: 12.2+ MB
# change the datatype: ListingNumber is an identifier, not a quantity,
# so store it as object dtype. DataFrame.assign returns a new frame,
# which avoids the SettingWithCopyWarning raised by assigning into a
# slice of prosper_loan.
loan_filter = loan_filter.assign(ListingNumber=loan_filter['ListingNumber'].astype('object'))
C:\Users\youdi\AppData\Local\Temp\ipykernel_15864\455395077.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
loan_filter['ListingNumber'] = loan_filter['ListingNumber'].astype('object')
loan_filter.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 113937 non-null object 1 ListingCreationDate 113937 non-null object 2 ListingCategory (numeric) 113937 non-null int64 3 Term 113937 non-null int64 4 LoanOriginalAmount 113937 non-null int64 5 MonthlyLoanPayment 113937 non-null float64 6 EstimatedLoss 84853 non-null float64 7 LenderYield 113937 non-null float64 8 LoanStatus 113937 non-null object 9 ProsperRating (Alpha) 84853 non-null object 10 ProsperScore 84853 non-null float64 11 EmploymentStatus 111682 non-null object 12 IncomeRange 113937 non-null object 13 StatedMonthlyIncome 113937 non-null float64 dtypes: float64(5), int64(3), object(6) memory usage: 12.2+ MB
# change ListingCreationDate to datetime dtype; assign() returns a new
# frame, which avoids the SettingWithCopyWarning raised by writing into
# a slice of prosper_loan
loan_filter = loan_filter.assign(ListingCreationDate=pd.to_datetime(loan_filter['ListingCreationDate']))
C:\Users\youdi\AppData\Local\Temp\ipykernel_15864\3590065032.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy loan_filter['ListingCreationDate'] = pd.to_datetime(loan_filter['ListingCreationDate'])
loan_filter[loan_filter['ProsperScore'].isna()].ListingCreationDate.dt.year.value_counts()
2007 11557 2008 11263 2006 6213 2009 28 2005 23 Name: ListingCreationDate, dtype: int64
Three columns share the same non-null count (84,853). Checking the listing years of the rows where these columns are null shows that all of those listings were created before July 2009, so I will filter the data to keep only listings created in or after July 2009.
# Keep only listings created in or after July 2009 (the rows with the
# three all-NaN columns all predate it). ISO date avoids ambiguous
# month/day parsing. reset_index(drop=True) renumbers the rows WITHOUT
# keeping the old index as a spurious 'index' column, which would
# otherwise pollute describe() and the column list.
loan_filter = loan_filter[loan_filter['ListingCreationDate'] >= '2009-07-01'].reset_index(drop=True)
loan_filter.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 84853 entries, 0 to 84852 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 84853 non-null int64 1 ListingNumber 84853 non-null object 2 ListingCreationDate 84853 non-null datetime64[ns] 3 ListingCategory (numeric) 84853 non-null int64 4 Term 84853 non-null int64 5 LoanOriginalAmount 84853 non-null int64 6 MonthlyLoanPayment 84853 non-null float64 7 EstimatedLoss 84853 non-null float64 8 LenderYield 84853 non-null float64 9 LoanStatus 84853 non-null object 10 ProsperRating (Alpha) 84853 non-null object 11 ProsperScore 84853 non-null float64 12 EmploymentStatus 84853 non-null object 13 IncomeRange 84853 non-null object 14 StatedMonthlyIncome 84853 non-null float64 dtypes: datetime64[ns](1), float64(5), int64(4), object(5) memory usage: 9.7+ MB
# add a non-numeric listing category
# Mapping from the dataset's numeric listing-category code to a label.
category = {0:'Not Available', 1:'Debt Consolidation', 2:'Home Improvement', 3:'Business', 4:'Personal Loan',
            5:'Student Use', 6:'Auto', 7:'Other', 8:'Baby&Adoption', 9:'Boat', 10:'Cosmetic Procedure',
            11:'Engagement Ring', 12:'Green Loans', 13:'Household Expenses', 14:'Large Purchases', 15:'Medical/Dental',
            16:'Motorcycle', 17:'RV', 18:'Taxes', 19:'Vacation', 20:'Wedding Loans'}
# Series.map replaces the manual append loop with one vectorized pass
loan_filter['ListingCategory'] = loan_filter['ListingCategory (numeric)'].map(category)
# the numeric code is now redundant
loan_filter.drop(columns='ListingCategory (numeric)', inplace=True)
# Convert ProsperRating (Alpha) and IncomeRange to ordered categoricals
# (worst -> best), then rename the rating column to drop the space.
rating_levels = ['HR', 'E', 'D', 'C', 'B', 'A', 'AA']
income_levels = ['Not employed', '$0', '$1-24,999', '$25,000-49,999',
                 '$50,000-74,999', '$75,000-99,999', '$100,000+']
for col, levels in [('ProsperRating (Alpha)', rating_levels), ('IncomeRange', income_levels)]:
    ordered_type = pd.api.types.CategoricalDtype(categories=levels, ordered=True)
    loan_filter[col] = loan_filter[col].astype(ordered_type)
loan_filter.rename(columns={'ProsperRating (Alpha)': 'ProsperRating'}, inplace=True)
loan_filter.describe()
| index | Term | LoanOriginalAmount | MonthlyLoanPayment | EstimatedLoss | LenderYield | ProsperScore | StatedMonthlyIncome | |
|---|---|---|---|---|---|---|---|---|
| count | 84853.000000 | 84853.000000 | 84853.000000 | 84853.000000 | 84853.000000 | 84853.000000 | 84853.000000 | 8.485300e+04 |
| mean | 56958.246850 | 42.486135 | 9083.440515 | 291.930720 | 0.080306 | 0.186017 | 5.950067 | 5.931175e+03 |
| std | 32885.220638 | 11.640346 | 6287.860058 | 186.678314 | 0.046764 | 0.074631 | 2.376501 | 8.239944e+03 |
| min | 1.000000 | 12.000000 | 1000.000000 | 0.000000 | 0.004900 | 0.030000 | 1.000000 | 0.000000e+00 |
| 25% | 28507.000000 | 36.000000 | 4000.000000 | 157.330000 | 0.042400 | 0.125900 | 4.000000 | 3.434000e+03 |
| 50% | 56960.000000 | 36.000000 | 7500.000000 | 251.940000 | 0.072400 | 0.177500 | 6.000000 | 5.000000e+03 |
| 75% | 85420.000000 | 60.000000 | 13500.000000 | 388.350000 | 0.112000 | 0.247400 | 8.000000 | 7.083333e+03 |
| max | 113936.000000 | 60.000000 | 35000.000000 | 2251.510000 | 0.366000 | 0.340000 | 11.000000 | 1.750003e+06 |
loan_filter.columns
Index(['index', 'ListingNumber', 'ListingCreationDate', 'Term',
'LoanOriginalAmount', 'MonthlyLoanPayment', 'EstimatedLoss',
'LenderYield', 'LoanStatus', 'ProsperRating', 'ProsperScore',
'EmploymentStatus', 'IncomeRange', 'StatedMonthlyIncome',
'ListingCategory'],
dtype='object')
My loan_filter dataset has 14 columns and 84,853 rows, starting from July 2009.
I want to see which factors have an impact on the lender yield.
I expect the influential features to be those that reflect the risk of losing money, along with the loan original amount and the term.
I also want to see whether the distribution of the lender yield matches the distribution of the actual money lenders stand to earn.
# Derived column: expected lender profit in dollars, i.e. the yield
# rate applied to the amount originally lent.
loan_filter['profit($)'] = loan_filter['LenderYield'] * loan_filter['LoanOriginalAmount']
# Four-panel view: distribution of LenderYield (top row) and of the
# derived profit column (bottom row), each as histogram + box plot.
fig, axes = plt.subplots(2, 2, figsize=[12, 10], dpi=250)
fig.suptitle('Lender Yield Distribution', fontsize=14)
panels = [
    ('LenderYield', axes[0, 0], axes[0, 1],
     'Lender Yield Histogram Plot', 'Lender Yield Box Plot'),
    ('profit($)', axes[1, 0], axes[1, 1],
     'Profit (lender yield * loan original amount) Histogram Plot', 'Profit Box Plot'),
]
for col, hist_ax, box_ax, hist_title, box_title in panels:
    sb.histplot(data=loan_filter, x=col, ax=hist_ax)
    sb.boxplot(data=loan_filter, x=col, ax=box_ax)
    hist_ax.set_title(hist_title)
    box_ax.set_title(box_title)
# show the yield axes as percentages
pct_ticks = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
pct_labels = [0, '10%', '20%', '30%', '40%', '50%']
axes[0, 0].set_xticks(pct_ticks, pct_labels)
axes[0, 1].set_xticks(pct_ticks, pct_labels)
plt.savefig("images/LenderYield.png");
from the histogram and the box plot we could see that the lender yield approximates a normal distribution with some outliers, and the profit is right skewed, with a lot of outliers.
# bar plot for the 10 frequents Purpose Of The Loan
def plot1(data, y, y_label, title):
    """Draw a count plot of the 10 most frequent values of column *y*.

    Parameters
    ----------
    data : pandas.DataFrame — source data.
    y : str — column whose categories are counted on the y axis.
    y_label : str — label for the y axis.
    title : str — figure title.
    """
    # value_counts() sorts descending, so the first 10 index entries are
    # the 10 most frequent categories; no list() wrapper needed
    listing_order = data[y].value_counts().index[:10]
    plt.figure(figsize=[12,7])
    sb.countplot(data=data, y=y, order=listing_order, palette='mako')
    plt.title(title, fontsize=16)
    plt.ylabel(y_label);
plot1(data=loan_filter, y='ListingCategory', y_label='Purpose Of The Loan', title='10 frequents Purpose Of The Loan')
we see that the most common reason to take a loan is debt consolidation.
import re
# Three side-by-side count plots of the borrower-quality ("refundability")
# variables: ProsperRating, IncomeRange, EmploymentStatus.
fig, axes = plt.subplots(1, 3, figsize=(12, 6), dpi=300)
fig.suptitle('Refundability', fontsize=16)
fig.supylabel('Count')
for i, col in enumerate(['ProsperRating', 'IncomeRange', 'EmploymentStatus']):
    sb.countplot(data=loan_filter, x=col, palette="mako",ax=axes[i])
    # annotate each bar with its count, then hide the now-redundant y axis
    axes[i].bar_label(axes[i].containers[0])
    axes[i].set_yticks([])
    axes[i].set(yticklabels=[])
    axes[i].set_xlabel('')
    axes[i].set_ylabel('')
    # split the CamelCase column name into words for the subplot title
    axes[i].set_title(f"{' '.join(re.findall('[A-Z][^A-Z]*', col))} count plot")
    axes[i].tick_params(axis='x', labelrotation=60)
plt.tight_layout()
# ProsperRating labels are short, so undo the rotation on the first panel
axes[0].tick_params(axis='x', labelrotation=0)
plt.savefig("images/Refundability");
# LoanOriginalAmount distribution on a linear scale (left) and log scale (right).
plt.figure(figsize=[12, 6], dpi=250)
plt.subplot(1,2,1)
sb.histplot(data=loan_filter, x='LoanOriginalAmount', bins=30, kde=True)
plt.xlim(0)
plt.xlabel('Loan Original Amount($)')
plt.title('Loan Original Amount histogram plot')
plt.subplot(1,2,2)
# log_scale=True reveals the roughly bimodal shape hidden by the right skew
sb.histplot(data=loan_filter, x='LoanOriginalAmount', log_scale=True, bins=30, kde=True)
plt.xticks([1e3, 2e3, 5e3, 1e4, 2e4], ["1k", "2k", "5k", "10k", "20k"])
plt.xlabel('Loan Original Amount($)')
plt.title('Loan Original Amount histogram - log scale');
from the histogram, we could see that the loan original amount is right skewed, in the log scale the distribution looks roughly bimodal, with one peak between 2500 and 5000, and a second peak between 10,000 and 13,000
# Column groups reused throughout the bivariate exploration:
# categorical borrower-quality variables...
refundability_vars = ['ProsperRating', 'IncomeRange', 'EmploymentStatus']
# ...and numeric variables to relate to LenderYield.
numerical_vars = [
    'Term', 'LoanOriginalAmount', 'EstimatedLoss', 'ProsperScore',
    'LenderYield', 'MonthlyLoanPayment', 'StatedMonthlyIncome',
]
# Refundability categoricals vs LenderYield: mean bar plot (left column)
# and full distribution box plot (right column) for each variable.
fig, axes = plt.subplots(3, 2, figsize=[8, 8], dpi=250)
fig.suptitle('Refundability Vs Lender Yield', fontsize=16)
for i, col in enumerate(refundability_vars):
    sb.barplot(data=loan_filter, x=col, y='LenderYield', ax=axes[i][0], palette='mako')
    sb.boxplot(data=loan_filter, y='LenderYield', x=col, ax=axes[i][1], palette='RdPu')
    axes[i][1].set_ylabel('')
    axes[i][0].tick_params(axis='x', labelrotation=60)
    axes[i][1].tick_params(axis='x', labelrotation=60)
    # show the yield axis as percentages
    axes[i][0].set_yticks([0, 0.1, 0.2, 0.3, 0.4], [0, '10%', '20%', '30%', '40%'])
    axes[i][1].set_yticks([0, 0.1, 0.2, 0.3, 0.4], [0, '10%', '20%', '30%', '40%'])
plt.tight_layout()
plt.savefig("images/Ref_len");
In the first two rows it is very clear that a low income or a low rating leads to the highest lender yield percentage; in the last row, the only clear pattern is that borrowers who are not employed have the highest lender yield percentage.
# Same layout as the yield plots, but against the derived profit($) column.
fig, axes = plt.subplots(3, 2, figsize=[8, 8], dpi=250)
fig.suptitle('Refundability Vs Profit', fontsize=16)
for i, col in enumerate(refundability_vars):
    sb.barplot(data=loan_filter, x=col, y='profit($)', ax=axes[i][0], palette='mako')
    sb.boxplot(data=loan_filter, y='profit($)', x=col, ax=axes[i][1], palette='RdPu')
    axes[i][1].set_ylabel('')
    axes[i][0].tick_params(axis='x', labelrotation=60)
    axes[i][1].tick_params(axis='x', labelrotation=60)
plt.tight_layout()
In these plots we do not see higher profit where refundability is lower; perhaps this is because lower refundability leads to a smaller original loan amount. I will check this in the next plot.
# Mean LoanOriginalAmount by ProsperRating and IncomeRange, to test
# whether lower refundability also means smaller loans.
fig, axes = plt.subplots(2, 1, figsize=[10, 5], dpi=250)
fig.suptitle('Refundability Vs LoanOriginalAmount', fontsize=12)
for i, col in enumerate(['ProsperRating', 'IncomeRange']):
    sb.barplot(data=loan_filter, x=col, y='LoanOriginalAmount', ax=axes[i], palette='mako')
    axes[i].tick_params(axis='x', labelrotation=60)
    axes[i].set_ylabel('Loan Original Amount($)')
This plot confirms the explanation for the plot above: lower refundability leads to a lower loan original amount (except for the $0 income bar, which is higher than some higher-income bars).
# Correlation of each numeric variable with LenderYield, as a 1-row heatmap.
plt.figure(figsize=[9, 2], dpi=300)
# single-row frame of correlations with LenderYield; the trivial
# LenderYield-vs-itself entry is dropped
yield_corr = pd.DataFrame(loan_filter[numerical_vars].corr()['LenderYield']).transpose().drop(columns='LenderYield')
plt.title('Correlation Between Lender Yield And Numerical Variables.')
# NOTE(review): xticks(rotation=...) is called before sb.heatmap draws its
# own tick labels — confirm the rotation actually survives; if not, move
# this call after the heatmap.
plt.xticks(rotation=30)
sb.heatmap(yield_corr, annot=True, center=0, cmap='twilight_shifted', fmt='.2f');
We see a strong positive correlation between lender yield and estimated loss, a moderate positive correlation between prosper score and lender yield, and a weaker correlation between lender yield and loan original amount.
# Estimated Loss vs Lender Yield: raw scatter (left, low alpha to fight
# overplotting) and 2-D histogram (right).
plt.figure(figsize=[18, 7], dpi=300)
plt.suptitle('Estimated Loss Vs Lender Yield', fontsize=16)
plt.subplot(1, 2, 1)
plt.scatter(data=loan_filter, x='EstimatedLoss', y='LenderYield', alpha=0.05)
plt.ylabel('Lender Yield')
plt.xlabel('Estimated Loss')
plt.subplot(1 , 2, 2)
# cmin=0.5 leaves bins with zero observations unfilled
plt.hist2d(data=loan_filter, x='EstimatedLoss', y='LenderYield', cmap='plasma_r', bins=[20, 20], cmin=0.5)
plt.colorbar()
plt.xlabel('Estimated Loss');
# Bivariate density of ProsperScore vs LenderYield.
plt.figure(figsize=[6, 6])
# NOTE(review): `shade` is deprecated in newer seaborn in favour of
# `fill` — confirm the installed version still accepts it
sb.kdeplot(data=loan_filter, x='ProsperScore', y='LenderYield', shade=True, thresh=0.05, cbar=True)
plt.title('Prosper Score Vs Lender Yield');
We can see that these factors have a larger impact on the lender yield, so we want to check whether there is a connection between them.
# EstimatedLoss against each refundability variable: mean bars (left)
# and box plots (right), with the loss axis shown as percentages.
fig, axes = plt.subplots(3, 2, figsize=[8, 8], dpi=250)
fig.suptitle('Estimated Loss Vs Refundability', fontsize=16)
for i, col in enumerate(refundability_vars):
    sb.barplot(data=loan_filter, x=col, y='EstimatedLoss', ax=axes[i][0], palette='mako')
    sb.boxplot(data=loan_filter, y='EstimatedLoss', x=col, ax=axes[i][1], palette='RdPu')
    axes[i][1].set_ylabel('')
    axes[i][0].tick_params(axis='x', labelrotation=60)
    axes[i][1].tick_params(axis='x', labelrotation=60)
    axes[i][0].set_yticks([0, 0.1, 0.2, 0.3, 0.4], [0, '10%', '20%', '30%', '40%'])
    axes[i][1].set_yticks([0, 0.1, 0.2, 0.3, 0.4], [0, '10%', '20%', '30%', '40%'])
plt.tight_layout()
# Average lender yield per listing month, to see the trend over time.
# ListingCreationDate is already datetime64 at this point, so the
# earlier redundant pd.to_datetime() wrapper is dropped; to_period('M')
# buckets listings by calendar month.
loan_filter['month_year'] = loan_filter['ListingCreationDate'].dt.to_period('M')
# groupby(...).mean().reset_index() already yields a DataFrame — no
# pd.DataFrame() wrapper needed
mean_loan_month = loan_filter.groupby('month_year')['LenderYield'].mean().reset_index()
plt.figure(figsize=[9, 4], dpi=300)
# Period values are not plottable directly; cast to str for the x axis
sb.lineplot(x=mean_loan_month['month_year'].astype(str), y=mean_loan_month['LenderYield'])
plt.xticks(rotation=90)
plt.title('Lender Yield over time');
we can see that at the end of the year, we have a downward trend and the lender yield decreased in the last year.
The main things that impact the lender yield are:
refundability (ProsperRating, EmploymentStatus, IncomeRange), when the refundability is higher, the lender yield is lower.
EstimatedLoss has a strong positive correlation.
we see that these factors have a connection between them, when the refundability is higher, the Estimated Loss is lower.
I found that when refundability is higher, the loan original amount is higher.
# Scatter of EstimatedLoss vs LenderYield coloured by ProsperRating,
# with both axes labelled as percentages.
plt.figure(figsize=[9, 4], dpi=300)
sb.scatterplot(data=loan_filter, x='EstimatedLoss', y='LenderYield', alpha=0.05, hue='ProsperRating', palette='RdYlGn')
plt.title('Comparing Lender Yield And Estimated Loss With Prosper Rating')
plt.xticks([0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35], [0, '5%', '10%', '15%', '20%', '25%', '30%', '35%'])
plt.yticks([0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35], [0, '5%', '10%', '15%', '20%', '25%', '30%', '35%'])
plt.savefig("images/multy");
def hist2dgrid(x, y, **kwargs):
    """Quick hack for creating heat maps with seaborn's FacetGrid.

    x, y : value sequences for the two axes of one facet.
    kwargs : must contain 'color'; it is consumed here and reused
        as the colormap name for hist2d.
    """
    # FacetGrid passes the facet colour through 'color'; reuse it as cmap
    palette = kwargs.pop('color')
    plt.hist2d(x, y, bins = [15, 15], cmap = palette, cmin = 0.3)
    # label both axes as percentages
    plt.xticks([0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35], [0, '5%', '10%', '15%', '20%', '25%', '30%', '35%'])
    plt.yticks([0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35], [0, '5%', '10%', '15%', '20%', '25%', '30%', '35%']);
# one EstimatedLoss-vs-LenderYield heat map per ProsperRating level, 4 facets per row
g = sb.FacetGrid(data = loan_filter, col = 'ProsperRating', col_wrap=4)
g.map(hist2dgrid, 'EstimatedLoss', 'LenderYield', color = 'plasma_r')
g.set_xlabels('Estimated Loss')
g.set_ylabels('Lender Yield');
we can see that when the prosper rating is higher, the color in the heat map gets closer to 0 in the 2 axis.
# Point plot: mean LenderYield per ProsperRating level, split by IncomeRange.
fig = plt.figure(figsize = [6,6], dpi=150)
ax = sb.pointplot(data = loan_filter, x = 'ProsperRating', y = 'LenderYield', hue = 'IncomeRange',
                  palette = 'Blues', linestyles = '', dodge = 0.4)
# fixed the misspelled user-facing title ('Prosprer Raitind And Clarity')
plt.title('Lender Yield Across Prosper Rating And Income Range');
in the first plot we can see that when we have a lower Prosper rating the lender yield and estimated loss are higher.